{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LogisticRegression parameters:\n",
      "aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)\n",
      "elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)\n",
      "family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)\n",
      "featuresCol: features column name. (default: features)\n",
      "fitIntercept: whether to fit an intercept term. (default: True)\n",
      "labelCol: label column name. (default: label)\n",
      "lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)\n",
      "lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The bounds vector size must beequal with 1 for binomial regression, or the number oflasses for multinomial regression. (undefined)\n",
      "maxIter: max number of iterations (>= 0). (default: 100, current: 10)\n",
      "predictionCol: prediction column name. (default: prediction)\n",
      "probabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. (default: probability)\n",
      "rawPredictionCol: raw prediction (a.k.a. confidence) column name. (default: rawPrediction)\n",
      "regParam: regularization parameter (>= 0). (default: 0.0, current: 0.01)\n",
      "standardization: whether to standardize the training features before fitting the model. (default: True)\n",
      "threshold: Threshold in binary classification prediction, in range [0, 1]. If threshold and thresholds are both set, they must match.e.g. if threshold is p, then thresholds must be equal to [1-p, p]. (default: 0.5)\n",
      "thresholds: Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0, excepting that at most one value may be 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold. (undefined)\n",
      "tol: the convergence tolerance for iterative algorithms (>= 0). (default: 1e-06)\n",
      "upperBoundsOnCoefficients: The upper bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)\n",
      "upperBoundsOnIntercepts: The upper bounds on intercepts if fitting under bound constrained optimization. The bound vector size must be equal with 1 for binomial regression, or the number of classes for multinomial regression. (undefined)\n",
      "weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0. (undefined)\n",
      "\n"
     ]
    },
    {
     "ename": "Py4JJavaError",
     "evalue": "An error occurred while calling o286.fit.\n: org.apache.spark.SparkException: Job aborted due to stage failure: Task 13 in stage 4.0 failed 1 times, most recent failure: Lost task 13.0 in stage 4.0 (TID 45, localhost, executor driver): org.apache.spark.SparkException: Python worker failed to connect back.\r\n\tat org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:170)\r\n\tat org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:97)\r\n\tat org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)\r\n\tat org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:108)\r\n\tat org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:337)\r\n\tat org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:335)\r\n\tat org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165)\r\n\tat org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156)\r\n\tat org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091)\r\n\tat org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156)\r\n\tat org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882)\r\n\tat org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:286)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)\r\n\tat org.apache.spark.scheduler.Task.run(Task.scala:121)\r\n\tat org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)\r\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)\r\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)\r\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\r\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\r\n\tat java.lang.Thread.run(Thread.java:748)\r\nCaused by: java.net.SocketTimeoutException: Accept timed out\r\n\tat java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)\r\n\tat java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:131)\r\n\tat java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:535)\r\n\tat java.net.PlainSocketImpl.accept(PlainSocketImpl.java:189)\r\n\tat java.net.ServerSocket.implAccept(ServerSocket.java:545)\r\n\tat java.net.ServerSocket.accept(ServerSocket.java:513)\r\n\tat org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:164)\r\n\t... 53 more\r\n\nDriver stacktrace:\r\n\tat org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)\r\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)\r\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)\r\n\tat scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)\r\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)\r\n\tat org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)\r\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)\r\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)\r\n\tat scala.Option.foreach(Option.scala:257)\r\n\tat org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)\r\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)\r\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)\r\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)\r\n\tat org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)\r\n\tat org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)\r\n\tat org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1098)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\r\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:363)\r\n\tat org.apache.spark.rdd.RDD.fold(RDD.scala:1092)\r\n\tat org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1161)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\r\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:363)\r\n\tat org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1137)\r\n\tat org.apache.spark.ml.classification.LogisticRegression$$anonfun$train$1.apply(LogisticRegression.scala:520)\r\n\tat org.apache.spark.ml.classification.LogisticRegression$$anonfun$train$1.apply(LogisticRegression.scala:494)\r\n\tat org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:183)\r\n\tat scala.util.Try$.apply(Try.scala:192)\r\n\tat org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:183)\r\n\tat org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:494)\r\n\tat org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:489)\r\n\tat org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:279)\r\n\tat org.apache.spark.ml.Predictor.fit(Predictor.scala:118)\r\n\tat org.apache.spark.ml.Predictor.fit(Predictor.scala:82)\r\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\r\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\r\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\r\n\tat java.lang.reflect.Method.invoke(Method.java:498)\r\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\r\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\r\n\tat py4j.Gateway.invoke(Gateway.java:282)\r\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\r\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\r\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\r\n\tat java.lang.Thread.run(Thread.java:748)\r\nCaused by: org.apache.spark.SparkException: Python worker failed to connect back.\r\n\tat org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:170)\r\n\tat org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:97)\r\n\tat org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)\r\n\tat org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:108)\r\n\tat org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:337)\r\n\tat org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:335)\r\n\tat org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165)\r\n\tat org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156)\r\n\tat org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091)\r\n\tat org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156)\r\n\tat org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882)\r\n\tat org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:286)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)\r\n\tat org.apache.spark.scheduler.Task.run(Task.scala:121)\r\n\tat org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)\r\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)\r\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)\r\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\r\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\r\n\t... 1 more\r\nCaused by: java.net.SocketTimeoutException: Accept timed out\r\n\tat java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)\r\n\tat java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:131)\r\n\tat java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:535)\r\n\tat java.net.PlainSocketImpl.accept(PlainSocketImpl.java:189)\r\n\tat java.net.ServerSocket.implAccept(ServerSocket.java:545)\r\n\tat java.net.ServerSocket.accept(ServerSocket.java:513)\r\n\tat org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:164)\r\n\t... 53 more\r\n",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mPy4JJavaError\u001b[0m                             Traceback (most recent call last)",
      "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_2576\\2879993100.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     38\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     39\u001b[0m \u001b[1;31m# Learn a LogisticRegression model. This uses the parameters stored in lr.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 40\u001b[1;33m \u001b[0mmodel1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtraining\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     41\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     42\u001b[0m \u001b[1;31m# Since model1 is a Model (i.e., a transformer produced by an Estimator),\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32md:\\Users\\Min\\Anaconda3\\envs\\pyspark2\\lib\\site-packages\\pyspark\\ml\\base.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, dataset, params)\u001b[0m\n\u001b[0;32m    130\u001b[0m                 \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    131\u001b[0m             \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 132\u001b[1;33m                 \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    133\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    134\u001b[0m             raise ValueError(\"Params must be either a param map or a list/tuple of param maps, \"\n",
      "\u001b[1;32md:\\Users\\Min\\Anaconda3\\envs\\pyspark2\\lib\\site-packages\\pyspark\\ml\\wrapper.py\u001b[0m in \u001b[0;36m_fit\u001b[1;34m(self, dataset)\u001b[0m\n\u001b[0;32m    293\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    294\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 295\u001b[1;33m         \u001b[0mjava_model\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit_java\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    296\u001b[0m         \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_create_model\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjava_model\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    297\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_copyValues\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32md:\\Users\\Min\\Anaconda3\\envs\\pyspark2\\lib\\site-packages\\pyspark\\ml\\wrapper.py\u001b[0m in \u001b[0;36m_fit_java\u001b[1;34m(self, dataset)\u001b[0m\n\u001b[0;32m    290\u001b[0m         \"\"\"\n\u001b[0;32m    291\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_transfer_params_to_java\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 292\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_java_obj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    293\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    294\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32md:\\Users\\Min\\Anaconda3\\envs\\pyspark2\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m   1255\u001b[0m         \u001b[0manswer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1256\u001b[0m         return_value = get_return_value(\n\u001b[1;32m-> 1257\u001b[1;33m             answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[0;32m   1258\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1259\u001b[0m         \u001b[1;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32md:\\Users\\Min\\Anaconda3\\envs\\pyspark2\\lib\\site-packages\\pyspark\\sql\\utils.py\u001b[0m in \u001b[0;36mdeco\u001b[1;34m(*a, **kw)\u001b[0m\n\u001b[0;32m     61\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     62\u001b[0m         \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 63\u001b[1;33m             \u001b[1;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     64\u001b[0m         \u001b[1;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     65\u001b[0m             \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoString\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32md:\\Users\\Min\\Anaconda3\\envs\\pyspark2\\lib\\site-packages\\py4j\\protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[1;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[0;32m    326\u001b[0m                 raise Py4JJavaError(\n\u001b[0;32m    327\u001b[0m                     \u001b[1;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 328\u001b[1;33m                     format(target_id, \".\", name), value)\n\u001b[0m\u001b[0;32m    329\u001b[0m             \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    330\u001b[0m                 raise Py4JError(\n",
      "\u001b[1;31mPy4JJavaError\u001b[0m: An error occurred while calling o286.fit.\n: org.apache.spark.SparkException: Job aborted due to stage failure: Task 13 in stage 4.0 failed 1 times, most recent failure: Lost task 13.0 in stage 4.0 (TID 45, localhost, executor driver): org.apache.spark.SparkException: Python worker failed to connect back.\r\n\tat org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:170)\r\n\tat org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:97)\r\n\tat org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)\r\n\tat org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:108)\r\n\tat org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:337)\r\n\tat org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:335)\r\n\tat org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165)\r\n\tat org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156)\r\n\tat org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091)\r\n\tat org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156)\r\n\tat org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882)\r\n\tat org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:286)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)\r\n\tat org.apache.spark.scheduler.Task.run(Task.scala:121)\r\n\tat org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)\r\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)\r\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)\r\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\r\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\r\n\tat java.lang.Thread.run(Thread.java:748)\r\nCaused by: java.net.SocketTimeoutException: Accept timed out\r\n\tat java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)\r\n\tat java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:131)\r\n\tat java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:535)\r\n\tat java.net.PlainSocketImpl.accept(PlainSocketImpl.java:189)\r\n\tat java.net.ServerSocket.implAccept(ServerSocket.java:545)\r\n\tat java.net.ServerSocket.accept(ServerSocket.java:513)\r\n\tat org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:164)\r\n\t... 53 more\r\n\nDriver stacktrace:\r\n\tat org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)\r\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)\r\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)\r\n\tat scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)\r\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)\r\n\tat org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)\r\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)\r\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)\r\n\tat scala.Option.foreach(Option.scala:257)\r\n\tat org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)\r\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)\r\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)\r\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)\r\n\tat org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)\r\n\tat org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)\r\n\tat org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1098)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\r\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:363)\r\n\tat org.apache.spark.rdd.RDD.fold(RDD.scala:1092)\r\n\tat org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1161)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\r\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:363)\r\n\tat org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1137)\r\n\tat org.apache.spark.ml.classification.LogisticRegression$$anonfun$train$1.apply(LogisticRegression.scala:520)\r\n\tat org.apache.spark.ml.classification.LogisticRegression$$anonfun$train$1.apply(LogisticRegression.scala:494)\r\n\tat org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:183)\r\n\tat scala.util.Try$.apply(Try.scala:192)\r\n\tat org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:183)\r\n\tat org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:494)\r\n\tat org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:489)\r\n\tat org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:279)\r\n\tat org.apache.spark.ml.Predictor.fit(Predictor.scala:118)\r\n\tat org.apache.spark.ml.Predictor.fit(Predictor.scala:82)\r\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\r\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\r\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\r\n\tat java.lang.reflect.Method.invoke(Method.java:498)\r\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\r\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\r\n\tat py4j.Gateway.invoke(Gateway.java:282)\r\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\r\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\r\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\r\n\tat java.lang.Thread.run(Thread.java:748)\r\nCaused by: org.apache.spark.SparkException: Python worker failed to connect back.\r\n\tat org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:170)\r\n\tat org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:97)\r\n\tat org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)\r\n\tat org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:108)\r\n\tat org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:337)\r\n\tat org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:335)\r\n\tat org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165)\r\n\tat org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156)\r\n\tat org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091)\r\n\tat org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156)\r\n\tat org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882)\r\n\tat org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:286)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)\r\n\tat org.apache.spark.scheduler.Task.run(Task.scala:121)\r\n\tat org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)\r\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)\r\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)\r\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\r\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\r\n\t... 1 more\r\nCaused by: java.net.SocketTimeoutException: Accept timed out\r\n\tat java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)\r\n\tat java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:131)\r\n\tat java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:535)\r\n\tat java.net.PlainSocketImpl.accept(PlainSocketImpl.java:189)\r\n\tat java.net.ServerSocket.implAccept(ServerSocket.java:545)\r\n\tat java.net.ServerSocket.accept(ServerSocket.java:513)\r\n\tat org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:164)\r\n\t... 53 more\r\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "os.environ['HADOOP_HOME'] = r'D:\\Hadoop'\n",
    "os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'\n",
    "os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'notebook'\n",
    "os.environ['PYSPARK_PYTHON'] = 'python'\n",
    "\n",
    "from pyspark.sql import SparkSession\n",
    "import pyspark.sql.types as T\n",
    "import pyspark.sql.functions as F\n",
    "from pyspark.sql.window import Window\n",
    "\n",
    "spark = (SparkSession\n",
    "            .Builder()\n",
    "            .appName(\"Protential Friend Recommendation\")\n",
    "            .master(\"local[*]\")\n",
    "            .config(\"spark.executor.memory\", \"64g\")\n",
    "            .config(\"spark.driver.memory\", \"64g\")\n",
    "            .getOrCreate())\n",
    "sc = spark.sparkContext\n",
    "spark.sparkContext.setLogLevel(\"ERROR\")\n",
    "\n",
    "\n",
    "from pyspark.ml.linalg import Vectors\n",
    "from pyspark.ml.classification import LogisticRegression\n",
    "\n",
    "# Prepare training data from a list of (label, features) tuples.\n",
    "training = spark.createDataFrame([\n",
    "    (1.0, Vectors.dense([0.0, 1.1, 0.1])),\n",
    "    (0.0, Vectors.dense([2.0, 1.0, -1.0])),\n",
    "    (0.0, Vectors.dense([2.0, 1.3, 1.0])),\n",
    "    (1.0, Vectors.dense([0.0, 1.2, -0.5]))], [\"label\", \"features\"])\n",
    "\n",
    "# Create a LogisticRegression instance. This instance is an Estimator.\n",
    "lr = LogisticRegression(maxIter=10, regParam=0.01)\n",
    "# Print out the parameters, documentation, and any default values.\n",
    "print(\"LogisticRegression parameters:\\n\" + lr.explainParams() + \"\\n\")\n",
    "\n",
    "# Learn a LogisticRegression model. This uses the parameters stored in lr.\n",
    "model1 = lr.fit(training)\n",
    "\n",
    "# Since model1 is a Model (i.e., a transformer produced by an Estimator),\n",
    "# we can view the parameters it used during fit().\n",
    "# This prints the parameter (name: value) pairs, where names are unique IDs for this\n",
    "# LogisticRegression instance.\n",
    "print(\"Model 1 was fit using parameters: \")\n",
    "print(model1.extractParamMap())\n",
    "\n",
    "# We may alternatively specify parameters using a Python dictionary as a paramMap\n",
    "paramMap = {lr.maxIter: 20}\n",
    "paramMap[lr.maxIter] = 30  # Specify 1 Param, overwriting the original maxIter.\n",
    "# Specify multiple Params.\n",
    "paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55})  # type: ignore\n",
    "\n",
    "# You can combine paramMaps, which are python dictionaries.\n",
    "# Change output column name\n",
    "paramMap2 = {lr.probabilityCol: \"myProbability\"}\n",
    "paramMapCombined = paramMap.copy()\n",
    "paramMapCombined.update(paramMap2)  # type: ignore\n",
    "\n",
    "# Now learn a new model using the paramMapCombined parameters.\n",
    "# paramMapCombined overrides all parameters set earlier via lr.set* methods.\n",
    "model2 = lr.fit(training, paramMapCombined)\n",
    "print(\"Model 2 was fit using parameters: \")\n",
    "print(model2.extractParamMap())\n",
    "\n",
    "# Prepare test data\n",
    "test = spark.createDataFrame([\n",
    "    (1.0, Vectors.dense([-1.0, 1.5, 1.3])),\n",
    "    (0.0, Vectors.dense([3.0, 2.0, -0.1])),\n",
    "    (1.0, Vectors.dense([0.0, 2.2, -1.5]))], [\"label\", \"features\"])\n",
    "\n",
    "# Make predictions on test data using the Transformer.transform() method.\n",
    "# LogisticRegression.transform will only use the 'features' column.\n",
    "# Note that model2.transform() outputs a \"myProbability\" column instead of the usual\n",
    "# 'probability' column since we renamed the lr.probabilityCol parameter previously.\n",
    "prediction = model2.transform(test)\n",
    "result = prediction.select(\"features\", \"label\", \"myProbability\", \"prediction\") \\\n",
    "    .collect()\n",
    "\n",
    "for row in result:\n",
    "    print(\"features=%s, label=%s -> prob=%s, prediction=%s\"\n",
    "          % (row.features, row.label, row.myProbability, row.prediction))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.7.13 ('pyspark2')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "48e87896ed641b154b37119b199fe5cbef001faa1d3cf1f339fb46433cd23b9b"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
